# Load library
library(dplyr)
library(survival)
library(janitor)
library(magrittr)
library(car)
library(ggplot2)
library(tidyverse)
library(broom)
library(MASS)
library(boot)
#print(citation("survival"), bibtex=TRUE)Cox regression modeling of survival
Data: Survival after chemotherapy for Stage B/C colon cancer [1] [2]
Description
These are data from one of the first successful trials of adjuvant chemotherapy for colon cancer. Levamisole is a low-toxicity compound previously used to treat worm infestations in animals; 5-FU is a moderately toxic (as these things go) chemotherapy agent. There are two records per person, one for recurrence and one for death.
The purpose of this project is to compare survival between the untreated (Obs) group vs those treated with amisole (Lev), or amisole + 5-FU.
Column names:
| id: | id |
| study: | 1 for all patients |
| rx: | Treatment - Obs(ervation), Lev(amisole), Lev(amisole)+5-FU |
| sex: | 1=male |
| age: | in years |
| obstruct: | obstruction of colon by tumour |
| perfor: | perforation of colon |
| adhere: | adherence to nearby organs |
| nodes: | number of lymph nodes with detectable cancer |
| time: | days until event or censoring |
| status: | censoring status |
| differ: | differentiation of tumour (1=well, 2=moderate, 3=poor) |
| extent: | Extent of local spread (1=submucosa, 2=muscle, 3=serosa, 4=contiguous structures) |
| surg: | time from surgery to registration (0=short, 1=long) |
| node4: | more than 4 positive lymph nodes |
| etype: | event type: 1=recurrence,2=death |
#Load data
colon <- as_tibble(colon)
head(colon)# A tibble: 6 × 16
id study rx sex age obstruct perfor adhere nodes status differ
<dbl> <dbl> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 Lev+5FU 1 43 0 0 0 5 1 2
2 1 1 Lev+5FU 1 43 0 0 0 5 1 2
3 2 1 Lev+5FU 1 63 0 0 0 1 0 2
4 2 1 Lev+5FU 1 63 0 0 0 1 0 2
5 3 1 Obs 0 71 0 0 1 7 1 2
6 3 1 Obs 0 71 0 0 1 7 1 2
# ℹ 5 more variables: extent <dbl>, surg <dbl>, node4 <dbl>, time <dbl>,
# etype <dbl>
Since the current analysis is focused on survival, filter data to death as the event type. This will create a data table with one row per individual.
colon_surv <- colon%>%filter(etype == 2) Identify participants who had recurrence. Identify those not censored for recurrence event. Filter event type = 1 (recurrence), status = 1.
recurrence <- colon%>%filter(etype == 1 & status == 1)%>%dplyr::select(id)
recurrence <- recurrence%>%mutate(recurrence = 1) # list of patients with recurrence
colon_surv <- colon_surv%>%merge(recurrence, by = "id", all.x = TRUE)
colon_surv$recurrence[is.na(colon_surv$recurrence)] <- 0I. Exploratory data analysis
Check missing values
na_counts <- sapply(colon_surv, function(x)sum(is.na(x)))
na_counts id study rx sex age obstruct perfor
0 0 0 0 0 0 0
adhere nodes status differ extent surg node4
0 18 0 23 0 0 0
time etype recurrence
0 0 0
# replace NAs with mode
table(colon_surv$differ)
1 2 3
93 663 150
mode(colon_surv$differ)[1] "numeric"
median(colon_surv$nodes, na.rm= TRUE)[1] 2
colon_surv$differ <- if_else(is.na(colon_surv$differ), 2,colon_surv$differ)
colon_surv$nodes <- if_else(is.na(colon_surv$nodes), 2,colon_surv$nodes)Insight: only nodes and differ columns have NA values. Replacing the 23 NAs in differ column with mode, and replace NAs in nodes with median.
Evaluate continuous variables
# age
hist(colon_surv$age)hist(colon_surv$nodes)hist(colon_surv$time)Insight: Age is normally distribute. Number of nodes is skewed to the right. Time is fairly normally distributed with most the individuals had event time between 500-3000 days.
Evaluate nodes column to investigate outliers
t <- colon_surv%>%filter(node4 ==1) # samples with more than positive lymph nodes
hist(t$nodes) Insight: samples with greater than 4 lymph nodes have less than 5 count in nodes column, so the two columns are not consistent. Therefore, nodes column will not be used for further analysis.
Evaluate categorical variables
summary_table <- colon_surv%>%summarise(count =n(),
male = sum(sex),
median_age = median(age),
ct_perforation = sum(perfor),
ct_adherence_nerby_organ = sum(adhere))
summary_table count male median_age ct_perforation ct_adherence_nerby_organ
1 929 484 61 27 135
Insight: Total number of participants: 929. About half of the participants are male and about half were censored, while the other half died.
colon_surv <- colon_surv%>%mutate(differentiation = case_when(differ == 1 ~ "well",
differ == 2 ~ "moderate",
differ == 3 ~ "poor"),
local_spread = case_when(extent == 1 ~ "submucosa",
extent == 2 ~ "muscle",
extent == 3 ~ "serosa",
extent == 4 ~ "contiguous"),
surg_to_reg_time = case_when(surg == 0~ "short",
surg == 1 ~ "long"))Frequency tables for categorical variables
# frequency tables for categorical variables
# Tumor differentiation
colon_surv %>%
tabyl(differentiation, rx) %>%
adorn_percentages("col") %>%
adorn_pct_formatting(digits = 1) %>%
adorn_ns() differentiation Obs Lev Lev+5FU
moderate 74.9% (236) 73.9% (229) 72.7% (221)
poor 16.5% (52) 14.2% (44) 17.8% (54)
well 8.6% (27) 11.9% (37) 9.5% (29)
# extent of local spread
colon_surv %>%
tabyl(local_spread, rx) %>%
adorn_percentages("col") %>%
adorn_pct_formatting(digits = 1) %>%
adorn_ns() local_spread Obs Lev Lev+5FU
contiguous 6.3% (20) 3.9% (12) 3.6% (11)
muscle 12.1% (38) 11.6% (36) 10.5% (32)
serosa 79.0% (249) 83.5% (259) 82.6% (251)
submucosa 2.5% (8) 1.0% (3) 3.3% (10)
# colum obstruction
colon_surv %>%
tabyl(obstruct, rx) %>%
adorn_percentages("col") %>%
adorn_pct_formatting(digits = 1) %>%
adorn_ns() obstruct Obs Lev Lev+5FU
0 80.0% (252) 79.7% (247) 82.2% (250)
1 20.0% (63) 20.3% (63) 17.8% (54)
# colon perforation
colon_surv %>%
tabyl(perfor, rx) %>%
adorn_percentages("col") %>%
adorn_pct_formatting(digits = 1) %>%
adorn_ns() perfor Obs Lev Lev+5FU
0 97.1% (306) 96.8% (300) 97.4% (296)
1 2.9% (9) 3.2% (10) 2.6% (8)
# Adherance to nearby organs
colon_surv %>%
tabyl(adhere, rx) %>%
adorn_percentages("col") %>%
adorn_pct_formatting(digits = 1) %>%
adorn_ns() adhere Obs Lev Lev+5FU
0 85.1% (268) 84.2% (261) 87.2% (265)
1 14.9% (47) 15.8% (49) 12.8% (39)
# extent of local tumor spread
colon_surv %>%
tabyl(local_spread, rx) %>%
adorn_percentages("col") %>%
adorn_pct_formatting(digits = 1) %>%
adorn_ns() local_spread Obs Lev Lev+5FU
contiguous 6.3% (20) 3.9% (12) 3.6% (11)
muscle 12.1% (38) 11.6% (36) 10.5% (32)
serosa 79.0% (249) 83.5% (259) 82.6% (251)
submucosa 2.5% (8) 1.0% (3) 3.3% (10)
# More than 4 lymph nodes with cancer
colon_surv %>%
tabyl(node4, rx) %>%
adorn_percentages("col") %>%
adorn_pct_formatting(digits = 1) %>%
adorn_ns() node4 Obs Lev Lev+5FU
0 72.4% (228) 71.3% (221) 74.0% (225)
1 27.6% (87) 28.7% (89) 26.0% (79)
# Recurrence
colon_surv %>%
tabyl(recurrence, rx) %>%
adorn_percentages("col") %>%
adorn_pct_formatting(digits = 1) %>%
adorn_ns() recurrence Obs Lev Lev+5FU
0 43.8% (138) 44.5% (138) 60.9% (185)
1 56.2% (177) 55.5% (172) 39.1% (119)
colon_surv %>%
tabyl(recurrence, status) %>%
adorn_percentages("col") %>%
adorn_pct_formatting(digits = 1) %>%
adorn_ns() recurrence 0 1
0 88.7% (423) 8.4% (38)
1 11.3% (54) 91.6% (414)
# time from surgery to registration
colon_surv %>%
tabyl(surg, rx) %>%
adorn_percentages("col") %>%
adorn_pct_formatting(digits = 1) %>%
adorn_ns() surg Obs Lev Lev+5FU
0 71.1% (224) 74.2% (230) 75.0% (228)
1 28.9% (91) 25.8% (80) 25.0% (76)
Summary statistics grouped by treatment
summary_table <- colon_surv%>%group_by(rx)%>%summarise(count =n(),
male = sum(sex),
median_age = median(age),
ct_perforation = sum(perfor),
ct_adherence_nerby_organ = sum(adhere),
perc_male = (male/count)*100,
iqr_age = IQR(age))
summary_table# A tibble: 3 × 8
rx count male median_age ct_perforation ct_adherence_nerby_o…¹ perc_male
<fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Obs 315 166 60 9 47 52.7
2 Lev 310 177 61 10 49 57.1
3 Lev+5FU 304 141 62 8 39 46.4
# ℹ abbreviated name: ¹ct_adherence_nerby_organ
# ℹ 1 more variable: iqr_age <dbl>
g <- colon_surv%>%filter(rx == "Lev+5FU")
summary(g$age) Min. 1st Qu. Median Mean 3rd Qu. Max.
26.0 52.0 62.0 59.7 70.0 81.0
Insight: Each treatment group had about 300 participants. Median age, number of participants with perforation and adherence are similar between the three groups.
II. Table 1: Description of the study population
| Observation (%) | Amisole (%) | Amisole + 5-FU (%) | ||
|---|---|---|---|---|
| N=315 | N=310 | N=304 | ||
| Demographics | ||||
| Male | 166 (52.3) | 177 (57.1) | 141 | |
| Median age (years) [IQR] | 60 [53,68] | 61 [53,69] | 61 [52,70] | |
| Cancer characteristics | ||||
| Colon obstruction | 63 (20.0) | 63 (20.3) | 54 (17.8) | |
| Colon perforation | 9 (2.9) | 10 (3.2) | 8 (2.6) | |
| Adherence to nearby organs | 47 (14.9) | 49 (15.8) | 39 (12.8) | |
| Differentiation of tumor | ||||
| Well | 27 (8.6) | 37 (11.9) | 29 (9.5) | |
| Moderate | 236 (74.9) | 229 (73.9) | 221 (72.7) | |
| Poor | 52 (16.5) | 44 (14.2) | 54 (17.8) | |
| Extent of local spread | ||||
| Contiguous | 20 (6.3) | 12 (3.9) | 11 (3.6) | |
| Muscle | 38 (12.1) | 36 (11.6) | 32 (10.5) | |
| Serosa | 249 (79.0) | 259 (83.5) | 251 (82.6) | |
| Submucosa | 8 (2.5) | 3 (1.0) | 10 (3.3) | |
| More than 4 lymph nodes with cancer | Yes | 87 (27.6) | 89 (28.7) | 79 (26.0) |
| Recurrence (%) | Yes | 177 (56.2) | 172 (55.5) | 119 (39.1) |
| Short time from surgery to registration (%) | Yes | 91 (28.9) | 80 (25.8) | 76 (25.0) |
III. Methods
The Cox proportional hazards model was used to model the relationship between survival time and different lung cancer treatments. In particlular the survival time will be compared between the untreated group (observation) vs. those treated with amisole (Lev), or amisole + 5-FU. The Cox regression model was chosen for this study because it is useful for studying association between survival time of patients and predictors and allows estimating the relative risk or hazard ratios due to the covariates, i.e., treatment status. The time (in days) until event, i.e, death, will be modeled as a function of treatment and other variables, including age, sex, and various tumor characteristics. Significant predictors were included in the final model.
Statistical analysis
The R statistical software version 4.3.2 [3] was used for all analysis. The Survival package was used to construct the Cox regression model [2] [1].
Cox regression model is based on the hazard function \(h_x(t)\) with covariates at time t given by:
\(h_x(t)=h_0(t)\exp(\beta_1x_1 +\beta_2x_2 + \dots + \beta_p x_p)\)
Where:
\(h_x(t)\) is the hazard function
\(h_0(t)\) is the baseline hazard function
\(\beta_1x_1 + \beta_2x_2 + \dots +\beta_p x_p\) represent the linear combination of covariates and their coefficient
The hazards for the observation vs. amisole (Lev), or amisole + 5-FU group with covariate values x1 and x2 are given by: \(hx_1(t)=h_0(t)\exp(\beta_1x_1)\) and \(hx_2(t)=h_0(t)\exp(\beta_2x_2)\), respectively
The hazard ratio is expressed as: HR = \(hx_2(t)\) / \(hx_1(t)\) = \(\exp[\beta(x_2-x_1)]\)
The Schoenfeld residual plot was constructed to test Cox proportional hazards assumption. When the proportional hazards assumpiton was not met for any of the covariates, stratification approach was explored. The Survminer [4] package was used to plot the Kaplan-Meier curve to visualize the survival probability over time for each treatment group.
Multicolinearity was tested using Variant Inflation Factor (VIF) calculated using MASS package [5].
The R MASS package was used for Stepwise model selection, using “both” forward and backward variable selection [5]. For Stepwise selection, stepAIC() function uses AIC (Akaike Information Criterion) as the measure to add or remove predictors from the model. Model performance was evaluated using 100-fold cross-validation using the boot package [6] [7].
IV. Analysis: Cox regression model
Plot survival times
# Create new incremental count id
colon_surv$idcount <- c(1:length(colon_surv$id))
# Order by survival time and create an order variable:
colon_surv <- colon_surv[order(-colon_surv$time, colon_surv$status),]
colon_surv$order <- c(1:length(colon_surv$idcount))
ggplot(data=colon_surv, aes(x=time, y=order)) +
geom_rect(xmin=23,xmax=colon_surv$time,ymin=colon_surv$order,ymax=colon_surv$order+1, colour="lightgray") +
geom_rect(xmin=colon_surv$time-2,xmax=colon_surv$time,ymin=colon_surv$order,ymax=colon_surv$order+1,
fill=factor(colon_surv$status+1)) +
geom_vline(xintercept= 1976,linetype="solid") +
scale_x_continuous(breaks=seq(20,3330,650)) +
geom_text(aes(2600, 750, label="Median Survival Time")) +
xlab("Survival Time (Days)") + ylab("Participants (ordered by survival time)") +
ggtitle("Survival Times for Participant") +
theme_classic() +
theme(legend.position="none",
panel.grid.major=element_blank(),
panel.grid.minor=element_blank(),
panel.background=element_blank(),
axis.line.x = element_line(color = "black"),
axis.line.y = element_line(color = "black"))Plot survival curve stratified by treatment group
library(survminer)
library(survival)
# Estimate the median survival time among the three groups
survfit(Surv(time,status) ~ rx, data = colon_surv)Call: survfit(formula = Surv(time, status) ~ rx, data = colon_surv)
n events median 0.95LCL 0.95UCL
rx=Obs 315 168 2083 1656 2789
rx=Lev 310 161 2152 1540 NA
rx=Lev+5FU 304 123 NA 2725 NA
# count the number of events after 2080 days, which is the median survival time among the observation group
tt <- colon_surv%>%filter(time > 2083)%>% group_by(rx)%>%summarise(ct = n(),
death = sum(status))
# Plot survival curve
fit <- survfit(Surv(time,status) ~ rx, data = colon_surv)
ggsurvplot(fit, data=colon_surv, risk.table = TRUE)# Estimate the probability of surviving beyond 3000 days
summary(survfit(Surv(time, status) ~ rx, data = colon_surv), times = 3000)Call: survfit(formula = Surv(time, status) ~ rx, data = colon_surv)
rx=Obs
time n.risk n.event survival std.err lower 95% CI
3.00e+03 6.00e+00 1.68e+02 4.08e-01 3.97e-02 3.37e-01
upper 95% CI
4.94e-01
rx=Lev
time n.risk n.event survival std.err lower 95% CI
3.00e+03 4.00e+00 1.61e+02 3.92e-01 5.63e-02 2.96e-01
upper 95% CI
5.20e-01
rx=Lev+5FU
time n.risk n.event survival std.err lower 95% CI
3.00e+03 7.00e+00 1.23e+02 5.61e-01 3.43e-02 4.97e-01
upper 95% CI
6.32e-01
# compare significant diffeerence is survival times between the three groups
survdiff(Surv(time, status)~ rx, data = colon_surv)Call:
survdiff(formula = Surv(time, status) ~ rx, data = colon_surv)
N Observed Expected (O-E)^2/E (O-E)^2/V
rx=Obs 315 168 148 2.58 3.85
rx=Lev 310 161 146 1.52 2.25
rx=Lev+5FU 304 123 157 7.55 11.62
Chisq= 11.7 on 2 degrees of freedom, p= 0.003
Insight: Based on the survival curve, the mediant survival time for the observation group is 2083 days. However, the median survival of Lev and Lev+5Fu group cannot be estimated, because there are too few events after 2083 days, which is the median survival time in the observation group.
The time for 50% survival probability of the group treated with Lev+5Fu is over 3000 days while the survival time for the observation and Lev group is around 2080 days. The probability of surviving to 3000 days among the Lev+5FU group is 56% (95% CI: 50-63), compared to 41% among the observation group.
The survival time is significantly different (P=0.003) between the three groups.
Fit base Model
m0 <- coxph(Surv(time, status) ~ 1, data = colon_surv)
summary(m0)Call: coxph(formula = Surv(time, status) ~ 1, data = colon_surv)
Null model
log likelihood= -2930.192
n= 929
Perform cox regression to test whether treatment is significant predictor
#| echo: true
#| message: false
#| warning: false
# Univariate analysis
m1 <- coxph(Surv(time, status) ~ rx, data = colon_surv)
summary(m1)Call:
coxph(formula = Surv(time, status) ~ rx, data = colon_surv)
n= 929, number of events= 452
coef exp(coef) se(coef) z Pr(>|z|)
rxLev -0.02664 0.97371 0.11030 -0.241 0.80917
rxLev+5FU -0.37171 0.68955 0.11875 -3.130 0.00175 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
exp(coef) exp(-coef) lower .95 upper .95
rxLev 0.9737 1.027 0.7844 1.2087
rxLev+5FU 0.6896 1.450 0.5464 0.8703
Concordance= 0.536 (se = 0.013 )
Likelihood ratio test= 12.15 on 2 df, p=0.002
Wald test = 11.56 on 2 df, p=0.003
Score (logrank) test = 11.68 on 2 df, p=0.003
Insight: the coefficient of Lev is not significant, suggesting that there is no evidence that this treatment affects survival time compared to observation. however Lev+5Fu is significant (p=0.00175), indicating that the treatment Lev +5Fu has a statistically significant effect on survival time compared to the reference group. The negative sign indicates that this treatment group has a lower hazard and likely a longer survival time.
The hazard ratio for Lex+5FU (0.690), indicating the risk of death is about 31% lower compared to the observation group.
The p-values indicate that the model is significant.
Test the Cox proportional hazard assumption
cox.zph(m1) chisq df p
rx 1.48 2 0.48
GLOBAL 1.48 2 0.48
zph_test <- cox.zph(m1)
print(zph_test) chisq df p
rx 1.48 2 0.48
GLOBAL 1.48 2 0.48
# plot the Schoenfeld residuals
plot(zph_test)Insight: The Schoenfeld residal plot shows that the residuals are scattered randomly and the smooth trend line is horizontal near 0. This suggests that the hazard ratio for rx (treatment status) is constant over time and the proportional hazard assumption is met. The global p-value is >0.05, indicating that the the assumption is met.
Perform multivariate analysis, including all variables to determine which predictors are significant.
# Subset data for modeling
df <- colon_surv%>%dplyr::select(!c(id,study,etype,differ,recurrence, extent,surg_to_reg_time, idcount, order, nodes))
# multivariant analysis
m2 <- coxph(Surv(time, status) ~ rx+ age + sex + perfor + adhere + surg + obstruct + differentiation + node4+
local_spread, data = df)
summary(m2)Call:
coxph(formula = Surv(time, status) ~ rx + age + sex + perfor +
adhere + surg + obstruct + differentiation + node4 + local_spread,
data = df)
n= 929, number of events= 452
coef exp(coef) se(coef) z Pr(>|z|)
rxLev -0.0160445 0.9840835 0.1115300 -0.144 0.885612
rxLev+5FU -0.3742695 0.6877915 0.1196533 -3.128 0.001760 **
age 0.0070459 1.0070708 0.0040596 1.736 0.082633 .
sex 0.0412783 1.0421421 0.0952633 0.433 0.664791
perfor 0.0005371 1.0005373 0.2695697 0.002 0.998410
adhere 0.1689086 1.1840119 0.1295242 1.304 0.192210
surg 0.2362177 1.2664500 0.1032961 2.287 0.022207 *
obstruct 0.2865577 1.3318350 0.1173103 2.443 0.014576 *
differentiationpoor 0.3579964 1.4304604 0.1222148 2.929 0.003398 **
differentiationwell 0.0812878 1.0846830 0.1662752 0.489 0.624930
node4 0.9353140 2.5480134 0.0988712 9.460 < 2e-16 ***
local_spreadmuscle -0.9503933 0.3865890 0.2554577 -3.720 0.000199 ***
local_spreadserosa -0.4526719 0.6359268 0.1986566 -2.279 0.022687 *
local_spreadsubmucosa -1.2432513 0.2884449 0.5396100 -2.304 0.021224 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
exp(coef) exp(-coef) lower .95 upper .95
rxLev 0.9841 1.0162 0.7909 1.2245
rxLev+5FU 0.6878 1.4539 0.5440 0.8696
age 1.0071 0.9930 0.9991 1.0151
sex 1.0421 0.9596 0.8646 1.2561
perfor 1.0005 0.9995 0.5899 1.6970
adhere 1.1840 0.8446 0.9186 1.5262
surg 1.2664 0.7896 1.0343 1.5507
obstruct 1.3318 0.7508 1.0583 1.6761
differentiationpoor 1.4305 0.6991 1.1258 1.8176
differentiationwell 1.0847 0.9219 0.7830 1.5026
node4 2.5480 0.3925 2.0991 3.0929
local_spreadmuscle 0.3866 2.5867 0.2343 0.6378
local_spreadserosa 0.6359 1.5725 0.4308 0.9387
local_spreadsubmucosa 0.2884 3.4669 0.1002 0.8306
Concordance= 0.674 (se = 0.012 )
Likelihood ratio test= 147 on 14 df, p=<2e-16
Wald test = 150.3 on 14 df, p=<2e-16
Score (logrank) test = 161.3 on 14 df, p=<2e-16
# Determine significant predictors
anova(m2)Analysis of Deviance Table
Cox model: response is Surv(time, status)
Terms added sequentially (first to last)
loglik Chisq Df Pr(>|Chi|)
NULL -2930.2
rx -2924.1 12.1478 2 0.0023022 **
age -2923.9 0.3443 1 0.5573434
sex -2923.9 0.0000 1 0.9964141
perfor -2923.8 0.3345 1 0.5630435
adhere -2921.3 5.0032 1 0.0253001 *
surg -2919.1 4.3247 1 0.0375630 *
obstruct -2916.9 4.4808 1 0.0342771 *
differentiation -2909.8 14.0968 2 0.0008688 ***
node4 -2865.5 88.5800 1 < 2.2e-16 ***
local_spread -2856.7 17.6698 3 0.0005145 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Insight: When all variables are included in the model, the anova test indicates that rx, adhere, surg, obstruct, differentiation, node4 and local spread are significant predictors.
The concordance of the multivariable model, 0.674, is higher than the univariate model (m1, concordance =0.53), suggesting that the multivariate model is a better fit model.
Calculate Variance Inflation Factor (VIF) to assess multicollinearity among predictors
vif <- vif(m2)
print(vif) GVIF Df GVIF^(1/(2*Df))
rx 1.035127 2 1.008668
age 1.042883 1 1.021217
sex 1.022620 1 1.011247
perfor 1.053420 1 1.026363
adhere 1.092721 1 1.045333
surg 1.013876 1 1.006914
obstruct 1.055334 1 1.027295
differentiation 1.062714 2 1.015323
node4 1.046133 1 1.022806
local_spread 1.091169 3 1.014648
Insight: None of the variables have VIF values above 5, therefore there is no multicollinearity
Variable selection method 1. Model survival while including different cancer characteristics as predictors separately to identify significance predictors.
m2 <- coxph(Surv(time, status) ~ rx+ age + sex + perfor + adhere + surg + obstruct + differentiation + node4+
local_spread, data = df)
m3 <- coxph(Surv(time, status) ~ rx + adhere + surg + obstruct + differentiation + node4+
local_spread, data = df)
summary(m3)Call:
coxph(formula = Surv(time, status) ~ rx + adhere + surg + obstruct +
differentiation + node4 + local_spread, data = df)
n= 929, number of events= 452
coef exp(coef) se(coef) z Pr(>|z|)
rxLev -0.01097 0.98909 0.11138 -0.098 0.921547
rxLev+5FU -0.37209 0.68929 0.11950 -3.114 0.001848 **
adhere 0.18378 1.20175 0.12788 1.437 0.150677
surg 0.23708 1.26754 0.10320 2.297 0.021605 *
obstruct 0.25643 1.29231 0.11520 2.226 0.026018 *
differentiationpoor 0.36574 1.44159 0.12195 2.999 0.002708 **
differentiationwell 0.06933 1.07179 0.16575 0.418 0.675751
node4 0.91247 2.49046 0.09782 9.328 < 2e-16 ***
local_spreadmuscle -0.92237 0.39758 0.25472 -3.621 0.000293 ***
local_spreadserosa -0.43266 0.64878 0.19829 -2.182 0.029111 *
local_spreadsubmucosa -1.25536 0.28497 0.53950 -2.327 0.019970 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
exp(coef) exp(-coef) lower .95 upper .95
rxLev 0.9891 1.0110 0.79512 1.2304
rxLev+5FU 0.6893 1.4508 0.54536 0.8712
adhere 1.2017 0.8321 0.93533 1.5440
surg 1.2675 0.7889 1.03542 1.5517
obstruct 1.2923 0.7738 1.03112 1.6197
differentiationpoor 1.4416 0.6937 1.13510 1.8308
differentiationwell 1.0718 0.9330 0.77450 1.4832
node4 2.4905 0.4015 2.05594 3.0168
local_spreadmuscle 0.3976 2.5152 0.24132 0.6550
local_spreadserosa 0.6488 1.5414 0.43986 0.9569
local_spreadsubmucosa 0.2850 3.5091 0.09899 0.8204
Concordance= 0.671 (se = 0.012 )
Likelihood ratio test= 143.8 on 11 df, p=<2e-16
Wald test = 146.7 on 11 df, p=<2e-16
Score (logrank) test = 157.7 on 11 df, p=<2e-16
anova(m3)Analysis of Deviance Table
Cox model: response is Surv(time, status)
Terms added sequentially (first to last)
loglik Chisq Df Pr(>|Chi|)
NULL -2930.2
rx -2924.1 12.1478 2 0.0023022 **
adhere -2921.4 5.4056 1 0.0200728 *
surg -2919.3 4.3145 1 0.0377883 *
obstruct -2917.1 4.2819 1 0.0385198 *
differentiation -2910.1 14.1346 2 0.0008525 ***
node4 -2866.9 86.2158 1 < 2.2e-16 ***
local_spread -2858.3 17.2535 3 0.0006268 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Treatment
m2a <- coxph(Surv(time, status) ~ rx, data = colon_surv) # significant
summary(m2a)Call:
coxph(formula = Surv(time, status) ~ rx, data = colon_surv)
n= 929, number of events= 452
coef exp(coef) se(coef) z Pr(>|z|)
rxLev -0.02664 0.97371 0.11030 -0.241 0.80917
rxLev+5FU -0.37171 0.68955 0.11875 -3.130 0.00175 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
exp(coef) exp(-coef) lower .95 upper .95
rxLev 0.9737 1.027 0.7844 1.2087
rxLev+5FU 0.6896 1.450 0.5464 0.8703
Concordance= 0.536 (se = 0.013 )
Likelihood ratio test= 12.15 on 2 df, p=0.002
Wald test = 11.56 on 2 df, p=0.003
Score (logrank) test = 11.68 on 2 df, p=0.003
# Demographics
m2b <- coxph(Surv(time, status) ~ age + sex, data = colon_surv) # not significant
summary(m2b)Call:
coxph(formula = Surv(time, status) ~ age + sex, data = colon_surv)
n= 929, number of events= 452
coef exp(coef) se(coef) z Pr(>|z|)
age 0.001951 1.001952 0.004031 0.484 0.628
sex 0.012955 1.013040 0.094205 0.138 0.891
exp(coef) exp(-coef) lower .95 upper .95
age 1.002 0.9981 0.9941 1.010
sex 1.013 0.9871 0.8422 1.218
Concordance= 0.511 (se = 0.014 )
Likelihood ratio test= 0.26 on 2 df, p=0.9
Wald test = 0.25 on 2 df, p=0.9
Score (logrank) test = 0.25 on 2 df, p=0.9
# cancer characterstics
m2c <- coxph(Surv(time, status) ~ perfor + adhere + obstruct, data = colon_surv) # adhere and obstruct are significant
summary(m2c)Call:
coxph(formula = Surv(time, status) ~ perfor + adhere + obstruct,
data = colon_surv)
n= 929, number of events= 452
coef exp(coef) se(coef) z Pr(>|z|)
perfor -0.03415 0.96643 0.26932 -0.127 0.8991
adhere 0.31011 1.36358 0.12572 2.467 0.0136 *
obstruct 0.25794 1.29426 0.11547 2.234 0.0255 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
exp(coef) exp(-coef) lower .95 upper .95
perfor 0.9664 1.0347 0.5701 1.638
adhere 1.3636 0.7334 1.0658 1.745
obstruct 1.2943 0.7726 1.0321 1.623
Concordance= 0.536 (se = 0.012 )
Likelihood ratio test= 10.82 on 3 df, p=0.01
Wald test = 11.51 on 3 df, p=0.009
Score (logrank) test = 11.6 on 3 df, p=0.009
# Differentiation of tumor
m2d <- coxph(Surv(time, status) ~ differentiation, data = colon_surv) # significant
summary(m2d)Call:
coxph(formula = Surv(time, status) ~ differentiation, data = colon_surv)
n= 929, number of events= 452
coef exp(coef) se(coef) z Pr(>|z|)
differentiationpoor 0.48265 1.62036 0.12040 4.009 6.1e-05 ***
differentiationwell -0.05027 0.95097 0.16408 -0.306 0.759
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
exp(coef) exp(-coef) lower .95 upper .95
differentiationpoor 1.620 0.6171 1.2798 2.052
differentiationwell 0.951 1.0516 0.6895 1.312
Concordance= 0.544 (se = 0.012 )
Likelihood ratio test= 15.34 on 2 df, p=5e-04
Wald test = 16.97 on 2 df, p=2e-04
Score (logrank) test = 17.31 on 2 df, p=2e-04
# Extent of local spread
m2f <- coxph(Surv(time, status) ~ local_spread, data = colon_surv) # significant
summary(m2f)Call:
coxph(formula = Surv(time, status) ~ local_spread, data = colon_surv)
n= 929, number of events= 452
coef exp(coef) se(coef) z Pr(>|z|)
local_spreadmuscle -1.0892 0.3365 0.2498 -4.361 1.29e-05 ***
local_spreadserosa -0.5043 0.6039 0.1927 -2.617 0.00888 **
local_spreadsubmucosa -1.7283 0.1776 0.5336 -3.239 0.00120 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
exp(coef) exp(-coef) lower .95 upper .95
local_spreadmuscle 0.3365 2.972 0.2062 0.5490
local_spreadserosa 0.6039 1.656 0.4139 0.8811
local_spreadsubmucosa 0.1776 5.631 0.0624 0.5054
Concordance= 0.552 (se = 0.009 )
Likelihood ratio test= 29.21 on 3 df, p=2e-06
Wald test = 25.35 on 3 df, p=1e-05
Score (logrank) test = 26.94 on 3 df, p=6e-06
# Recurrence
# m2g <- coxph(Surv(time, status) ~ recurrence, data = colon_surv) # significant
# summary(m2g)
# short time from surgery to registration
m2h <- coxph(Surv(time, status) ~ surg, data = colon_surv) # significant
summary(m2h)Call:
coxph(formula = Surv(time, status) ~ surg, data = colon_surv)
n= 929, number of events= 452
coef exp(coef) se(coef) z Pr(>|z|)
surg 0.2333 1.2627 0.1026 2.274 0.0229 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
exp(coef) exp(-coef) lower .95 upper .95
surg 1.263 0.7919 1.033 1.544
Concordance= 0.523 (se = 0.011 )
Likelihood ratio test= 5.01 on 1 df, p=0.03
Wald test = 5.17 on 1 df, p=0.02
Score (logrank) test = 5.2 on 1 df, p=0.02
# include significant predictors in final model
model <- coxph(Surv(time, status) ~ rx + adhere + surg + obstruct + differentiation
+ local_spread, data = colon_surv)
summary(model)Call:
coxph(formula = Surv(time, status) ~ rx + adhere + surg + obstruct +
differentiation + local_spread, data = colon_surv)
n= 929, number of events= 452
coef exp(coef) se(coef) z Pr(>|z|)
rxLev -0.0047151 0.9952960 0.1110663 -0.042 0.966137
rxLev+5FU -0.3588384 0.6984872 0.1191372 -3.012 0.002596 **
adhere 0.1499547 1.1617816 0.1285420 1.167 0.243380
surg 0.2103485 1.2341081 0.1031988 2.038 0.041521 *
obstruct 0.2000040 1.2214077 0.1148937 1.741 0.081723 .
differentiationpoor 0.4507752 1.5695283 0.1217364 3.703 0.000213 ***
differentiationwell -0.0003827 0.9996174 0.1651661 -0.002 0.998151
local_spreadmuscle -0.9927505 0.3705561 0.2551148 -3.891 9.97e-05 ***
local_spreadserosa -0.4291507 0.6510618 0.1987757 -2.159 0.030853 *
local_spreadsubmucosa -1.5035698 0.2223351 0.5385347 -2.792 0.005239 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
exp(coef) exp(-coef) lower .95 upper .95
rxLev 0.9953 1.0047 0.80059 1.2373
rxLev+5FU 0.6985 1.4317 0.55303 0.8822
adhere 1.1618 0.8607 0.90304 1.4947
surg 1.2341 0.8103 1.00812 1.5108
obstruct 1.2214 0.8187 0.97513 1.5299
differentiationpoor 1.5695 0.6371 1.23637 1.9925
differentiationwell 0.9996 1.0004 0.72318 1.3817
local_spreadmuscle 0.3706 2.6986 0.22475 0.6110
local_spreadserosa 0.6511 1.5360 0.44099 0.9612
local_spreadsubmucosa 0.2223 4.4977 0.07738 0.6389
Concordance= 0.618 (se = 0.013 )
Likelihood ratio test= 63.62 on 10 df, p=7e-10
Wald test = 60.86 on 10 df, p=2e-09
Score (logrank) test = 63.16 on 10 df, p=9e-10
Test proportional hazard assumption for the model with significant predictors
cox.zph(model) chisq df p
rx 1.8302 2 0.40
adhere 0.1401 1 0.71
surg 0.0596 1 0.81
obstruct 6.6401 1 0.01
differentiation 20.0601 2 4.4e-05
local_spread 6.7511 3 0.08
GLOBAL 35.5839 10 9.9e-05
zph_test <- cox.zph(m2)
print(zph_test) chisq df p
rx 2.4436 2 0.29470
age 0.5116 1 0.47444
sex 0.9111 1 0.33981
perfor 0.6792 1 0.40987
adhere 0.1074 1 0.74310
surg 0.0281 1 0.86691
obstruct 6.1995 1 0.01278
differentiation 17.7305 2 0.00014
node4 5.7169 1 0.01680
local_spread 7.1050 3 0.06862
GLOBAL 40.5778 14 0.00021
# plot the Schoenfeld residuals
plot(zph_test)Insight: Differentiation variable does not meet proportional hazards assumption.
Variable selection method 2: Use the MASS package stepAIC() function for stepwise selection by using AIC (Akaike Information Criterion) as the measure to add or remove predictors from the model.
library(MASS) # for stepwise regression
library(boot) # for cross-validationPerform Stepwise variable selection
# stepwise selection
stepwise_model <- stepAIC(m3, direction = "both")Start: AIC=5738.63
Surv(time, status) ~ rx + adhere + surg + obstruct + differentiation +
node4 + local_spread
Df AIC
- adhere 1 5738.6
<none> 5738.6
- obstruct 1 5741.4
- surg 1 5741.7
- differentiation 2 5743.0
- rx 2 5747.1
- local_spread 3 5749.9
- node4 1 5816.8
Step: AIC=5738.62
Surv(time, status) ~ rx + surg + obstruct + differentiation +
node4 + local_spread
Df AIC
<none> 5738.6
+ adhere 1 5738.6
- obstruct 1 5741.3
- surg 1 5742.1
- differentiation 2 5743.8
- rx 2 5747.2
- local_spread 3 5751.5
- node4 1 5816.1
summary(stepwise_model)Call:
coxph(formula = Surv(time, status) ~ rx + surg + obstruct + differentiation +
node4 + local_spread, data = df)
n= 929, number of events= 452
coef exp(coef) se(coef) z Pr(>|z|)
rxLev -0.006658 0.993365 0.111374 -0.060 0.952333
rxLev+5FU -0.371547 0.689666 0.119513 -3.109 0.001878 **
surg 0.244822 1.277393 0.103165 2.373 0.017639 *
obstruct 0.255151 1.290656 0.115179 2.215 0.026743 *
differentiationpoor 0.381493 1.464469 0.121454 3.141 0.001683 **
differentiationwell 0.062221 1.064197 0.165705 0.375 0.707295
node4 0.908394 2.480335 0.097820 9.286 < 2e-16 ***
local_spreadmuscle -0.977922 0.376092 0.251726 -3.885 0.000102 ***
local_spreadserosa -0.490330 0.612424 0.193922 -2.528 0.011456 *
local_spreadsubmucosa -1.337841 0.262411 0.536169 -2.495 0.012589 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
exp(coef) exp(-coef) lower .95 upper .95
rxLev 0.9934 1.0067 0.79856 1.2357
rxLev+5FU 0.6897 1.4500 0.54564 0.8717
surg 1.2774 0.7828 1.04354 1.5636
obstruct 1.2907 0.7748 1.02984 1.6175
differentiationpoor 1.4645 0.6828 1.15425 1.8581
differentiationwell 1.0642 0.9397 0.76908 1.4726
node4 2.4803 0.4032 2.04760 3.0045
local_spreadmuscle 0.3761 2.6589 0.22963 0.6160
local_spreadserosa 0.6124 1.6329 0.41878 0.8956
local_spreadsubmucosa 0.2624 3.8108 0.09175 0.7505
Concordance= 0.669 (se = 0.012 )
Likelihood ratio test= 141.8 on 10 df, p=<2e-16
Wald test = 145.2 on 10 df, p=<2e-16
Score (logrank) test = 156 on 10 df, p=<2e-16
# FINAL MODEL
m2_select <- coxph(formula = Surv(time, status) ~ rx + surg + obstruct +
differentiation + node4 + local_spread, data = df)
summary(m2_select)Call:
coxph(formula = Surv(time, status) ~ rx + surg + obstruct + differentiation +
node4 + local_spread, data = df)
n= 929, number of events= 452
coef exp(coef) se(coef) z Pr(>|z|)
rxLev -0.006658 0.993365 0.111374 -0.060 0.952333
rxLev+5FU -0.371547 0.689666 0.119513 -3.109 0.001878 **
surg 0.244822 1.277393 0.103165 2.373 0.017639 *
obstruct 0.255151 1.290656 0.115179 2.215 0.026743 *
differentiationpoor 0.381493 1.464469 0.121454 3.141 0.001683 **
differentiationwell 0.062221 1.064197 0.165705 0.375 0.707295
node4 0.908394 2.480335 0.097820 9.286 < 2e-16 ***
local_spreadmuscle -0.977922 0.376092 0.251726 -3.885 0.000102 ***
local_spreadserosa -0.490330 0.612424 0.193922 -2.528 0.011456 *
local_spreadsubmucosa -1.337841 0.262411 0.536169 -2.495 0.012589 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
exp(coef) exp(-coef) lower .95 upper .95
rxLev 0.9934 1.0067 0.79856 1.2357
rxLev+5FU 0.6897 1.4500 0.54564 0.8717
surg 1.2774 0.7828 1.04354 1.5636
obstruct 1.2907 0.7748 1.02984 1.6175
differentiationpoor 1.4645 0.6828 1.15425 1.8581
differentiationwell 1.0642 0.9397 0.76908 1.4726
node4 2.4803 0.4032 2.04760 3.0045
local_spreadmuscle 0.3761 2.6589 0.22963 0.6160
local_spreadserosa 0.6124 1.6329 0.41878 0.8956
local_spreadsubmucosa 0.2624 3.8108 0.09175 0.7505
Concordance= 0.669 (se = 0.012 )
Likelihood ratio test= 141.8 on 10 df, p=<2e-16
Wald test = 145.2 on 10 df, p=<2e-16
Score (logrank) test = 156 on 10 df, p=<2e-16
cox_summary <- tidy(m2_select)Cross-Validation for Model Evaluation
# cross validation function
cv_cox <- function(data, indices) {
# subset data
d <- data[indices, ]
# fit model on subset
fit <- coxph(Surv(time, status) ~ rx + age+ surg + obstruct + differentiation + node4 +
local_spread, data = df)
# return the concordance index (C-index)
return(survConcordance(Surv(time, status) ~ predict(fit), data = d)$concordance)
}
# Set seed for reproducibility
set.seed(45)
# perform 100-fold cross-validation
cv_results <- boot(data = df, statistic = cv_cox, R = 100)
cv_results
ORDINARY NONPARAMETRIC BOOTSTRAP
Call:
boot(data = df, statistic = cv_cox, R = 100)
Bootstrap Statistics :
original bias std. error
t1* 0.6718376 -0.170906 0.01343439
Visualization of cross-validation results: plot concordance index from bootstrap samples
# Print summary of cross-validation results
median(cv_results$t)[1] 0.5024297
# Visualize results
plot(cv_results)Insight: The concordance index (c-index) of the original model (0.67) is higher than the median bootstrap samples concordance (0.5), suggesting overestimation of of model performance on the original dataset. The model predictive power is not needs improvement based on the c-index.
V. Results
Table 2. Univariate model: Survival after Chemotherapy for stage B/C Colon Cancer
| Treatment | Coefficient | Hazard ratio | 95% CI_upper | 95% CI_lower | P-value |
|---|---|---|---|---|---|
| Amisole (Lev) | -0.027 | 0.974 | 0.784 | 1.209 | 0.809 |
| Amisole + 5-FU | -0.372 | 0.690 | 0.546 | 0.870 | 0.002 |
Table 3. Multivariate model: Survival after Chemotherapy for stage B/C Colon Cancer
| Treatment | Coefficient | Hazard ratio | 95% CI_upper | 95% CI_lower | P-value |
|---|---|---|---|---|---|
| Amisole (Lev) | -0.011 | 0.989 | 0.795 | 1.231 | 0.923 |
| Amisole + 5-FU | -0.376 | 0.687 | 0.543 | 0.868 | 0.002 |
| Age | 0.007 | 1.007 | 0.999 | 1.015 | 0.069 |
| Surge | 0.244 | 1.276 | 1.042 | 1.562 | 0.018 |
| Obstruction of colon | 0.283 | 1.327 | 1.057 | 1.667 | 0.015 |
| Differentiat ion_poor | 0.374 | 1.453 | 1.145 | 1.844 | 0.002 |
| Differentiat ion_well | 0.069 | 1.072 | 0.774 | 1.483 | 0.677 |
| More than 4 nodes (+) | 0.930 | 2.534 | 2.089 | 3.074 | 3.75 x 10-21 |
| Local spread_muscle | -0.996 | 0.370 | 0.225 | 0.606 | 7.85 x 10-5 |
| Local spread_serosa | -0.501 | 0.606 | 0.414 | 0.886 | 0.010 |
| Local spread_submucosa | -1.322 | 0.267 | 0.093 | 0.763 | 0.014 |